Core SNP maximum-likelihood tree with metadata

Author

Lakhansing Pardeshi

Published

May 24, 2024

Initial setup

Code
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(configr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(ggnewscale))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(treeio))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(ggbreak))
suppressPackageStartupMessages(library(ggtreeExtra))
suppressPackageStartupMessages(library(scales))

## visualized phylogenetic tree with metadata

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/heatmap_utils.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

treeMethod <- "core_snp_ml"     #ani_upgma, kmer_nj
pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

outDir <- confs$analysis$phylogeny$core_snp_ml$dir
outPrefix <- paste(outDir, "/", treeMethod, sep = "")

Import tree data and required metadata

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId 
)

speciesOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny[[treeMethod]]$files$species_order)
)

metDf <- suppressMessages(
  readr::read_csv(panConf$db$metrics$files$per_genome, col_names = T)
) %>% 
  dplyr::rename_all(
    .funs = ~ stringr::str_replace_all(., "\\W+", "_")
  ) %>% 
  dplyr::rename_all(
    .funs = ~ stringr::str_replace_all(., "_+$", "")
  ) %>% 
  dplyr::select(
    Genome, Gene_count, mRNA_count, CDS_count, tRNA_count, rRNA_count,
    Homology_groups, Singletons, GC_content
  ) %>% 
  dplyr::mutate(genomeId = paste("g_", Genome, sep = "")) %>% 
  dplyr::select(-Genome)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Code
## add species order factor levels to SpeciesName column
sampleInfo %<>% dplyr::mutate(
  SpeciesName = forcats::fct_relevel(SpeciesName, !!!speciesOrder$SpeciesName)
) %>% 
  dplyr::left_join(
    y = metDf, by = "genomeId"
  ) %>% 
  dplyr::group_by(SpeciesName) %>% 
  dplyr::mutate(
    mRNA_med_diff = mRNA_count - median(mRNA_count),
    genome_size_med_diff = length - median(length),
  ) %>% 
  dplyr::ungroup()

Plot the tree

Code
## read tree
rawTree <- import_tree(
  confs$analysis$phylogeny[[treeMethod]]$files$tree_rooted
)

## add data to tree
treeTbl <- tidytree::as_tibble(rawTree) %>%
  dplyr::full_join(y = sampleInfo, by = c("label" = "genomeId")) %>%
  treeio::as.treedata()

Explore the tree topology

Code
rootNode <- rootnode(treeTbl) 

# 
# scale_color_binned(
#   scale_name = "stepsn",
#   palette = function(x) c("green", "red", "black"),
#   breaks = c(70, 90),
#   limits = c(0, 100),
#   na.value = "black",
#   show.limits = TRUE,
#   guide = "colorsteps"
# ) +
# ggnewscale::new_scale_color()

pt_tree <- ggtree::ggtree(tr = treeTbl, color="black", size = 0.7) +
  theme_tree2() +
  ggtree::geom_point2(
    mapping = aes(
      subset = !isTip & node != rootNode, 
      color = cut(bootstrap, c(0, 70, 90, 100))),
    shape = 16, size = 2
    
  ) +
  scale_color_manual(
    values = c("blue", "#f781bf", "black"),
    guide = guide_legend(override.aes = list(size = 6), order = 1),
    name = 'Bootstrap %', 
    breaks = c('(0,70]', '(70,90]'), 
    labels = expression(BP < "70%", "70%" <= BP * " < 90%"),
    na.value = "transparent"
  ) +
  ggnewscale::new_scale_fill()

One of the branch is very long and hence trimming its length to 0.5 for visualization purpose.

Code
longestBranch <- which(pt_tree$data$x == max(pt_tree$data$x))
longestBranchTip <- pt_tree$data$label[longestBranch]
trimLongestBranchTo <- 0.47
pt_tree$data$x[longestBranch] <- trimLongestBranchTo

Representative species phylogenetic tree

Code
# speciesDend <- ape::keep.tip(
#   phy = ape::ladderize(rawTree@phylo), tip = representativeSpecies
# ) %>%
#   ape::chronos() %>%
#   ape::as.hclust.phylo() %>%
#   as.dendrogram()


speciesTree <- representative_genomes_tree(phy = rawTree@phylo, metadata = sampleInfo) %>% 
  tidytree::as_tibble() %>% 
  dplyr::left_join(y = sampleInfo, by = c("label" = "genomeId")) %>%
  treeio::as.treedata() %>% 
  ggtree::ggtree(size = 2) +
  ggtree::xlim(NA, 1) +
  ggtree::geom_tiplab(
    mapping = aes(label = SpeciesName),
    align = TRUE, size = 5, fontface = "italic"
  )

speciesTree$data$x[which(speciesTree$data$label == longestBranchTip)] <- trimLongestBranchTo

ggsave(
  filename = paste(outPrefix, ".representative_species_tree.pdf", sep = ""),
  plot = speciesTree,
  width = 6, height = 8
)

speciesTree

View tree with metadata

Code
# theme for legend adjustment
theme_legend <- ggplot2::theme(
  legend.justification = c(0, 1),
  legend.box = "vertical",
  legend.text = element_text(size = 16),
  legend.title = element_text(size = 18, face = "bold")
)

# theme for ggplot
pt_theme <- ggplot2::theme_bw() +
  theme(
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_blank()
  ) +
  theme_legend

pt_tree2 <- pt_tree +
  theme_tree() +
  geom_treescale(
    x = 0, y = nrow(sampleInfo) * 0.5,
    fontsize = 8, linesize = 2, offset = 4
  ) +
  # layout_fan(angle = 10) +
  scale_y_continuous(expand=c(0, 10)) +
  ggnewscale::new_scale_color() +
  ggtree::geom_tiplab(
    mapping = aes(label = NA),
    align = TRUE, linesize = 0.25
  )  +
  ggnewscale::new_scale_color() +
  theme_legend
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Code
spKeyDf <- get_species_key_data(
  genomes = sampleInfo$genomeId, speciesInfo = sampleInfo, type = 'wide'
) %>% 
  tibble::as_tibble(rownames = "genomeId") %>% 
  tidyr::pivot_longer(
    cols = -genomeId,
    names_to = 'sp', values_to = "spCol"
  ) %>% 
  dplyr::mutate(sp = forcats::fct_relevel(sp, !!!speciesOrder$SpeciesName))



# species key
pt_spKey <- dplyr::rename(spKeyDf, label = genomeId) %>% 
  ggplot2::ggplot(mapping = aes(x = sp, y = label)) +
  ggplot2::geom_tile(
    mapping = aes(fill = spCol)
  ) +
  scale_fill_manual(
    values = c("0" = "grey95", "1" = "black"),
    guide = "none"
  ) +
  pt_theme +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank()
  )

# blackleg pcr
pt_pcr <- dplyr::select(sampleInfo, label = genomeId, virulence_pcr) %>% 
  ggplot2::ggplot(mapping = aes(x = "pcr", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = virulence_pcr)) +
  # ggplot2::geom_point(mapping = aes(color = virulence_pcr), size = 1) +
  scale_fill_manual(
    name = "PCR BL diagnosis",
    values = c("red", "green"),
    breaks = c("positive", "negative"),
    na.value = alpha("white", 0),
    guide = guide_legend(override.aes = list(size = 6), order = 2)
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# blackleg field trial
pt_bl <- dplyr::select(sampleInfo, label = genomeId, virulence) %>% 
  ggplot2::ggplot(mapping = aes(x = "pcr", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = virulence)) +
  # ggplot2::geom_point(mapping = aes(color = virulence), size = 1, shape = 17) +
  scale_fill_manual(
    name='Field trial',
    values = c("red", "green", "black"),
    breaks = c("virulent", "avirulent", "inconclusive"),
    label = c("BL-causing", "BL non-causing", "Inconclusive"),
    na.value = alpha("white", 0),
    guide = guide_legend(override.aes = list(size = 6), order = 3)
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# continent
pt_loc <- dplyr::select(sampleInfo, label = genomeId, continent) %>% 
  ggplot2::ggplot(mapping = aes(x = "location", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = continent)) +
  scale_fill_manual(
    name = "Continent of isolation",
    values = c("#E6AC00", "#D12E96", "#5E8B3A", "#AA741D", "#A5E7E1", "#244360"),
    na.value = alpha("white", 1),
    guide = guide_legend(override.aes = list(size = 6), order = 4),
    na.translate = FALSE
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# genome size
pt_len <- dplyr::select(sampleInfo, label = genomeId, length) %>% 
  ggplot2::ggplot(mapping = aes(x = length - median(length), y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  scale_x_continuous(
    breaks = scales::breaks_pretty(n = 3),
    labels = scales::label_number(scale_cut = c(kb = 1000, mb = 1000000))
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )

# mRNA count difference with species median
pt_gene_count <- dplyr::select(sampleInfo, label = genomeId, mRNA_count) %>% 
  ggplot2::ggplot(mapping = aes(x = mRNA_count - median(mRNA_count), y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  scale_x_continuous(
    breaks = scales::breaks_pretty(n = 3)
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )

# %GC
pt_gc <- dplyr::select(sampleInfo, label = genomeId, GC_content) %>% 
  ggplot2::ggplot(mapping = aes(x = GC_content, y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  ggplot2::coord_cartesian(xlim = c(50, NA)) +
  scale_x_continuous(
    n.breaks = 2,
    breaks = scales::breaks_pretty(n = 1)
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )


pt_tree3 <- pt_spKey %>% aplot::insert_left(pt_tree2, width = 6) %>% 
  aplot::insert_right(pt_pcr, width = 0.2) %>% 
  aplot::insert_right(pt_bl, width = 0.2) %>% 
  aplot::insert_right(pt_loc, width = 0.2) %>% 
  # aplot::insert_right(pt_len, width = 0.8) %>% 
  aplot::insert_right(pt_gene_count, width = 0.8) %>%
  aplot::insert_right(pt_gc, width = 0.5)

ggsave(
  filename = paste(outPrefix, ".tree_plot.pdf", sep = ""), plot = pt_tree3,
  width = 15, height = 16
)

Species key in the above figure marks the species on tree from left to right: P. cacticida, P. fontis, P. colocasium, Pectobacterium sp. CFBP8739, P. c. subsp. carotovorum, P. aroidearum, P. betavasculorum, P. zantedeschiae, P. peruviense, P. atrosepticum, P. polonicum, P. punjabense, P. wasabiae, P. parmentieri, P. actinidiae, P. odoriferum, P. carotovorum, P. versatile, P. aquaticum, P. quasiaquaticum, P. parvum, P. polaris, P. brasiliense.

Back to top